0. Introduction
The purpose of this analysis is to determine which gene sets from MSigDB are most enriched in the genes we previously predicted to contain IREs. This is important as we wanted to see the biological relevance of the IRE-containing genes, and also their overlap in existing gene sets.
The approach is as follows:
- Load in relevant gene set collections from MSigDB.
- Get them into a format suitable for performing Fisher’s exact test to test whether they are enriched for IRE-containing genes.
- Results displayed on a bar chart and also as a network.
The motivation for doing this analysis comes from an interesting observation using the naive approach of searching MSigDB for all gene sets with iron or heme in their name and seeing the overlap in genes between these gene sets and the IRE-containing genesets we identified.
ironSets <- list(
c5_GO_2_IRON_2_SULFUR_CLUSTER_BINDING = c5_mapped$GO_2_IRON_2_SULFUR_CLUSTER_BINDING,
c5_GO_4_IRON_4_SULFUR_CLUSTER_BINDING = c5_mapped$GO_4_IRON_4_SULFUR_CLUSTER_BINDING,
c5_GO_CELLULAR_IRON_ION_HOMEOSTASIS = c5_mapped$GO_CELLULAR_IRON_ION_HOMEOSTASIS,
c5_GO_HEME_METABOLIC_PROCESS = c5_mapped$GO_HEME_METABOLIC_PROCESS,
c5_GO_HEME_BIOSYNTHETIC_PROCESS = c5_mapped$GO_HEME_BIOSYNTHETIC_PROCESS,
c5_GO_HEMOGLOBIN_COMPLEX = c5_mapped$GO_HEMOGLOBIN_COMPLEX,
c5_GO_IRON_COORDINATION_ENTITY_TRANSPORT = c5_mapped$GO_IRON_COORDINATION_ENTITY_TRANSPORT,
c5_GO_IRON_ION_BINDING = c5_mapped$GO_IRON_ION_BINDING,
c5_GO_IRON_ION_HOMEOSTASIS = c5_mapped$GO_IRON_ION_HOMEOSTASIS,
c5_GO_IRON_ION_IMPORT = c5_mapped$GO_IRON_ION_IMPORT,
c5_GO_IRON_ION_TRANSPORT = c5_mapped$GO_IRON_ION_TRANSPORT,
c5_GO_OXIDOREDUCTASE_ACTIVITY_ACTING_ON_A_HEME_GROUP_OF_DONORS = c5_mapped$GO_OXIDOREDUCTASE_ACTIVITY_ACTING_ON_A_HEME_GROUP_OF_DONORS,
c5_GO_RESPONSE_TO_IRON_ION = c5_mapped$GO_RESPONSE_TO_IRON_ION,
h_HEME_METABOLISM = h_mapped$HALLMARK_HEME_METABOLISM,
c2_REACTOME_IRON_UPTAKE_AND_TRANSPORT = c2_mapped$REACTOME_IRON_UPTAKE_AND_TRANSPORT
)
ironSets_idx <- ids2indices(ironSets, rownames(v))
head(ironSets,2)
$c5_GO_2_IRON_2_SULFUR_CLUSTER_BINDING
[1] "ENSDARG00000062780" "ENSDARG00000079516" "ENSDARG00000052703" "ENSDARG00000013044" "ENSDARG00000007745" "ENSDARG00000038154"
[7] "ENSDARG00000051956" "ENSDARG00000052190" "ENSDARG00000003462" "ENSDARG00000045733" "ENSDARG00000043665" "ENSDARG00000026582"
[13] "ENSDARG00000035596" "ENSDARG00000020054" "ENSDARG00000074356" "ENSDARG00000028546" "ENSDARG00000092713" "ENSDARG00000058725"
[19] "ENSDARG00000042159" "ENSDARG00000055240"
$c5_GO_4_IRON_4_SULFUR_CLUSTER_BINDING
[1] "ENSDARG00000055472" "ENSDARG00000021466" "ENSDARG00000074552" "ENSDARG00000010267" "ENSDARG00000086853" "ENSDARG00000042727"
[7] "ENSDARG00000062216" "ENSDARG00000027689" "ENSDARG00000026376" "ENSDARG00000036438" "ENSDARG00000074889" "ENSDARG00000004952"
[13] "ENSDARG00000035074" "ENSDARG00000052721" "ENSDARG00000051986" "ENSDARG00000092713" "ENSDARG00000077698" "ENSDARG00000021985"
[19] "ENSDARG00000104848" "ENSDARG00000110572" "ENSDARG00000042881" "ENSDARG00000004517" "ENSDARG00000062987" "ENSDARG00000054821"
[25] "ENSDARG00000038154" "ENSDARG00000051956" "ENSDARG00000011072" "ENSDARG00000055653" "ENSDARG00000058801" "ENSDARG00000007526"
[31] "ENSDARG00000026582" "ENSDARG00000035596" "ENSDARG00000058533" "ENSDARG00000022845" "ENSDARG00000038834" "ENSDARG00000028546"
[37] "ENSDARG00000045308" "ENSDARG00000007294" "ENSDARG00000078479" "ENSDARG00000074410"
head(ironSets_idx,2)
$c5_GO_2_IRON_2_SULFUR_CLUSTER_BINDING
[1] 392 1152 2009 3261 4076 4317 5266 5874 6672 7030 7494 7669 7725 7829 9213 10372 11992 13712
$c5_GO_4_IRON_4_SULFUR_CLUSTER_BINDING
[1] 582 666 1075 1117 1553 1678 3513 3591 3669 4053 4076 4219 4317 5167 5266 5472 5874 5996 6790 6835 7375
[22] 7669 7675 7834 8248 8392 8444 9159 9231 10196 10441 12008 12054 12192 13108 13371 14022 18609
We used a gene set enrichment analysis approach to test for whether these iron-related gene sets were enriched in our DE results.
source(here("R","GSEA","combinedGSEA.R"))
gseaResults_ironSets <- combinedGSEA(v, ironSets_idx, design, contrasts)
# gseaResults_ironSets %>% saveRDS(here("R","GSEA","results","gseaResults_ironSets.rds"))
Results for 6 month, normoxia, mutant vs. wild type below indicate that while some iron-related gene sets show enrichment in this comparison, others do not.
gseaResults_ironSets$combTest$normoxia_6mth_mutant_vs_wt
Overall, the figure below indicates poor overlap between existing gene sets and our IRE-containing gene sets. However, to really see how our IRE genesets relate to existing gene sets, we need to do a more comprehensive test including more of the gene sets not restricted to ones related to iron metabolism.
# Create a dataframe that includes overlap between iron-related gene sets and the ireGenes.
testx <- ironSets %>% lapply(function(x){
overlap <- list(
with3ire = sum(x %in% ireGenes$ire3_all),
with5ire = sum(x %in% ireGenes$ire5_all),
noIre = sum(!(x %in% ireGenes$ire3_all | x %in% ireGenes$ire5_all))
)
}) %>% lapply(function(x){
x %>% bind_rows
}) %>%
do.call("rbind",.) %>%
mutate(geneset = names(ironSets)) %>%
bind_rows(data.frame(
with3ire = length(ireGenes$ire3_all),
with5ire = length(ireGenes$ire5_all),
noIre = 0,
geneset = "- GRCz11 Zebrafish Genes - All Predicted IREs"
)) %>%
bind_rows(data.frame(
with3ire = ireUtr3 %>% as.data.frame %>% dplyr::filter(quality == "High") %>% use_series("gene_id") %>% unique %>% length,
with5ire = ireUtr5 %>% as.data.frame %>% dplyr::filter(quality == "High") %>% use_series("gene_id") %>% unique %>% length,
noIre = 0,
geneset = "- GRCz11 Zebrafish Genes - High Quality Predicted IREs"
)) %>%
mutate(geneset = gsub(x = geneset, pattern = "_", replacement = " "))%>%
melt
binding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorUsing geneset as id variables
# Create a stacked bar chart
compPlot <- testx %>% ggplot(aes(x = str_to_title(stringr::str_wrap(geneset, 20)),
y = value,
fill = variable)) +
geom_col() +
theme(aspect.ratio = 0.7,
axis.text.x = element_text(size= 35),
axis.title.x = element_text(size= 30),
axis.title.y = element_text(size = 30),
legend.text = element_text(size = 20),
legend.title = element_text(size=30),
axis.text.y = element_text(color = "grey20", size = 16)) +
coord_flip() + # Axis labels too long to be readable so this helps.
scale_fill_manual(values=c("#FBB829", "#FF0066", "#dddddd"), labels =c("3' IRE", "5' IRE", "No IRE")) +
labs(y = "Number of genes", x = "Gene set", fill = "Gene\nhas\nIRE?")
compPlot

1. Defining Relevant Gene Sets
We will be testing whether our IRE gene sets are enriched in any of the gene sets in the following gene set collections from MSigDB:
- Hallmark gene sets (H) are coherently expressed signatures derived by aggregating many MSigDB gene sets to represent well-defined biological states or processes.
- Curated gene sets (C2) are from online pathway databases, publications in PubMed, and knowledge of domain experts.
- Motif gene sets (C3) are based on conserved cis-regulatory motifs from a comparative analysis of the human, mouse, rat, and dog genomes.
- Gene ontology gene sets (C5) consist of genes annotated by the same GO terms.
The gene set collections mapped to zebrafish Ensembl IDs have been pre-loaded for this analysis. Each collection is a named list of gene sets, with each gene set containing Ensembl IDs of genes in that set. We will convert all lists into list column structures used in the tibble package.
h_tib <- h_mapped %>% tibble %>% set_colnames(c("ids")) %>% mutate(geneset = names(h_mapped), source = "h")
c2_tib <- c2_mapped %>% tibble %>% set_colnames(c("ids")) %>% mutate(geneset = names(c2_mapped), source = "c2")
c3_tib <- c3_mapped %>% tibble %>% set_colnames(c("ids")) %>% mutate(geneset = names(c3_mapped), source = "c3")
c5_tib <- c5_mapped %>% tibble %>% set_colnames(c("ids")) %>% mutate(geneset = names(c5_mapped), source = "c5")
gs <- bind_rows(h_tib, c2_tib, c3_tib, c5_tib)
gs
We previously loaded the ireGenes R object, which has the following structure:
ireGenes%>%str
List of 4
$ ire3_all: chr [1:1207] "ENSDARG00000102878" "ENSDARG00000070477" "ENSDARG00000006038" "ENSDARG00000063481" ...
$ ire5_all: chr [1:393] "ENSDARG00000045911" "ENSDARG00000017386" "ENSDARG00000045886" "ENSDARG00000015921" ...
$ ire3_hq : chr [1:106] "ENSDARG00000074223" "ENSDARG00000024874" "ENSDARG00000110878" "ENSDARG00000089296" ...
$ ire5_hq : chr [1:49] "ENSDARG00000045911" "ENSDARG00000101351" "ENSDARG00000100006" "ENSDARG00000015551" ...
allIREGenes <- c(ireGenes$ire3_all, ireGenes$ire5_all)
allIREGenes %>% head
[1] "ENSDARG00000102878" "ENSDARG00000070477" "ENSDARG00000006038" "ENSDARG00000063481" "ENSDARG00000045842" "ENSDARG00000095332"
2. Compute gene set overlap with IRE genesets
We wish to see the overlap between the genesets in gs with the ireGenes (including 3’ and 5’ IRE genes), as well as separately, with the 3’ IRE genes and 5’ IRE genes. We will add this information into the gs tibble.
gs %<>% rowwise() %>% mutate(
n = length(ids), # Number of genes in the geneset
n_with_ire = sum(ids %in% allIREGenes), # Number of genes in the gene set which have 3' or 5' predicted IREs
n_without_ire = (n - n_with_ire),
universe_with_ire = sum(rownames(v) %in% allIREGenes & !(rownames(v) %in% ids)),
universe_without_ire = (length(rownames(v)) - universe_with_ire),
n_with_ire3 = sum(ids %in% ireGenes$ire3_all), # Number of genes in the gene set which have 3' predicted IREs
n_without_ire3= (n - n_with_ire),
universe_with_ire3 = sum(rownames(v) %in% ireGenes$ire3_all & !(rownames(v) %in% ids)),
universe_without_ire3 = (length(rownames(v)) - universe_with_ire),
n_with_ire5 = sum(ids %in% ireGenes$ire5_all), # Number of genes in the gene set which have 5' predicted IREs
n_without_ire5= (n - n_with_ire),
universe_with_ire5 = sum(rownames(v) %in% ireGenes$ire5_all & !(rownames(v) %in% ids)),
universe_without_ire5 = (length(rownames(v)) - universe_with_ire)
) %>% ungroup()
gs
3. Contingency tables
Create contingency table for each gene set. We will store this in a list of matrices gs_mat. The 3’ and 5’ contingency matrices are stored in gs_mat3 and gs_mat5.
gs_mat <- gs %>%
dplyr::select(n_with_ire,
n_without_ire,
universe_with_ire,
universe_without_ire) %>%
apply(X = ., MARGIN = 1, FUN = function(x){
x %>%
matrix(2,2) %>%
t %>%
list()
}) %>% set_names(gs$geneset) %>%
lapply(function(x){
x %>% .[[1]]
})
gs_mat3 <- gs %>%
dplyr::select(n_with_ire3,
n_without_ire3,
universe_with_ire3,
universe_without_ire3) %>%
apply(X = ., MARGIN = 1, FUN = function(x){
x %>%
matrix(2,2) %>%
t %>%
list()
}) %>% set_names(gs$geneset) %>%
lapply(function(x){
x %>% .[[1]]
})
gs_mat5 <- gs %>%
dplyr::select(n_with_ire5,
n_without_ire5,
universe_with_ire5,
universe_without_ire5) %>%
apply(X = ., MARGIN = 1, FUN = function(x){
x %>%
matrix(2,2) %>%
t %>%
list()
}) %>% set_names(gs$geneset) %>%
lapply(function(x){
x %>% .[[1]]
})
gs_mat[1:3]
$HALLMARK_TNFA_SIGNALING_VIA_NFKB
[,1] [,2]
[1,] 15 243
[2,] 1433 18274
$HALLMARK_HYPOXIA
[,1] [,2]
[1,] 19 241
[2,] 1430 18277
$HALLMARK_CHOLESTEROL_HOMEOSTASIS
[,1] [,2]
[1,] 5 86
[2,] 1443 18264
gs_mat3[1:3]
$HALLMARK_TNFA_SIGNALING_VIA_NFKB
[,1] [,2]
[1,] 9 243
[2,] 1119 18274
$HALLMARK_HYPOXIA
[,1] [,2]
[1,] 14 241
[2,] 1115 18277
$HALLMARK_CHOLESTEROL_HOMEOSTASIS
[,1] [,2]
[1,] 4 86
[2,] 1124 18264
gs_mat5[1:3]
$HALLMARK_TNFA_SIGNALING_VIA_NFKB
[,1] [,2]
[1,] 6 243
[2,] 344 18274
$HALLMARK_HYPOXIA
[,1] [,2]
[1,] 7 241
[2,] 343 18277
$HALLMARK_CHOLESTEROL_HOMEOSTASIS
[,1] [,2]
[1,] 2 86
[2,] 348 18264
4. Fisher’s exact test
On each contingency table, we will run Fisher’s exact test. We then apply FDR correction to adjust the raw p-values for multiple testing.
fisher_res <- gs_mat %>% lapply(function(x){
x %>% fisher.test()
})
fisher_res3 <- gs_mat3 %>% lapply(function(x){
x %>% fisher.test()
})
fisher_res5 <- gs_mat5 %>% lapply(function(x){
x %>% fisher.test()
})
fisher_res_p <- fisher_res %>% lapply(function(x){x$p.value})
fisher_res_p3 <- fisher_res3 %>% lapply(function(x){x$p.value})
fisher_res_p5 <- fisher_res5 %>% lapply(function(x){x$p.value})
gs %<>%
mutate(fisher_p = fisher_res_p%>%unlist%>%unname,
fisher_p_3 = fisher_res_p3%>%unlist%>%unname,
fisher_p_5 = fisher_res_p5%>%unlist%>%unname) %>%
mutate(fdr = p.adjust(fisher_p, "fdr"),
fdr_3 = p.adjust(fisher_p_3, "fdr"),
fdr_5 = p.adjust(fisher_p_5, "fdr"))
5. Calculate Expected and Observed
For each gene set, we will calculate the number of genes expected to have IREs (based on the background proportion) and observed value. This information will be added to the gs object.
gs %<>% rowwise() %>% mutate(
exp_allIRE = (universe_with_ire / universe_without_ire)*n_without_ire,
obs_allIRE = n_with_ire,
obs_greater_than_exp_allIRE = obs_allIRE > exp_allIRE,
exp_ire3 = (universe_with_ire3 / universe_without_ire3)*n_without_ire3,
obs_ire3 = n_with_ire3,
obs_greater_than_exp_ire3 = obs_ire3 > exp_ire3,
exp_ire5 = (universe_with_ire5 / universe_without_ire5)*n_without_ire5,
obs_ire5 = n_with_ire5,
obs_greater_than_exp_ire5 = obs_ire5 > exp_ire5
)
gs %>%
dplyr::select(geneset, contains("exp"), starts_with("obs"), ) %>% ungroup()
6. Results
The following table shows the top 50 gene sets enriched in IRE genesets, ranked by Fisher’s exact test p-values:
gs %>% ungroup() %>%
arrange(fisher_p) %>%
dplyr::select(geneset, contains("fdr"), n_with_ire, n, starts_with("obs_greater")) %>%
head(50)
Sorted by genesets most enriched in 3’ IRE genes:
gs %>% ungroup %>% arrange(fisher_p_3) %>% dplyr::select(geneset, contains("fdr"), n_with_ire3, n, starts_with("obs_greater")) %>% head(50)
Sorted by genesets most enriched in 5’ IRE genes:
gs %>% ungroup %>% arrange(fisher_p_5) %>% dplyr::select(geneset, contains("fdr"), n_with_ire5, n, starts_with("obs_greater")) %>% head(50)
7. Visualisation (Stacked bar chart)
The following stacked bar chart shows the overlap between the top ~20 genesets and the predicted-IRE genesets.
overlapDf <- gs %>% arrange(fisher_p) %>% dplyr::filter(fdr < 0.1 | fdr_3 < 0.1 | fdr_5 < 0.1) %>%
dplyr::filter(obs_greater_than_exp_allIRE == TRUE | obs_greater_than_exp_ire3 == TRUE | obs_greater_than_exp_ire5 == TRUE) %>%
dplyr::select(geneset, source, n_with_ire3, n_with_ire5, n_without_ire) %>%
bind_rows(data.frame(
geneset = c("- GRCz11 Zebrafish Genes - All Predicted IREs"),
source = c("sires"),
n_with_ire3 = c(length(ireGenes$ire3_all)),
# ireUtr3 %>% as.data.frame %>% dplyr::filter(quality == "High") %>% use_series("gene_id") %>% unique %>% length),
n_with_ire5 = c(length(ireGenes$ire5_all)),
# ireUtr5 %>% as.data.frame %>% dplyr::filter(quality == "High") %>% use_series("gene_id") %>% unique %>% length),
n_without_ire = c(0)
)) %>% dplyr::mutate(geneset = paste0(geneset, " ", source)) %>%
dplyr::mutate(geneset = gsub(x=geneset, pattern = "_", replacement = " ")) %>%
dplyr::select(-source) %>% melt
binding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorUsing geneset as id variables
overlapPlot <- overlapDf %>% ggplot(aes(x = str_to_title(stringr::str_wrap(geneset, 23)),
y = value,
fill = variable)) +
geom_col() +
theme(aspect.ratio = 0.7,
axis.text.x = element_text(size= 35),
axis.title.x = element_text(size= 30),
axis.title.y = element_text(size = 30),
legend.text = element_text(size = 20),
legend.title = element_text(size=30),
axis.text.y = element_text(color = "grey20", size = 13)) +
coord_flip() + # Axis labels too long to be readable so this helps.
scale_fill_manual(values=c("#FBB829", "#FF0066", "#dddddd"), labels =c("3' IRE", "5' IRE", "No IRE")) +
labs(y = "Number of genes", x = "Gene set", fill = "Gene\nhas\nIRE?")
overlapPlot

# export::graph2ppt(overlapPlot, here("R","GSEA","fig","overlapPlot"))
8. Visualisation (Network)
Although the stacked bar chart shows overlap between different gene sets with the predicted IRE geneset, along with information about the size of each geneset, crucially, it doesn’t indicate which genes are in common between the different gene sets. My first idea was pairwise Venn diagrams between the IRE genes in each pair of genesets, but this isn’t the most visual / intuitive way to present the information and it doesn’t highlight the genes which are most often shared between different genesets. Steve had the idea to represent this as a network, with distinct groups of genes (grey) representing genesets, IRE genes in colour, and edges representing overlap between genesets.
8.1. Create the nodes table
I will need to create a data.frame with the following columns:
- Id: Gene ID
- IRE: Factor which can either be
3' IRE, 5' IRE, or no IRE.
- Geneset: The gene set which it belongs to.
nodes<- gs %>% arrange(fisher_p) %>% dplyr::filter(fdr < 0.1 | fdr_3 < 0.1 | fdr_5 < 0.1) %>%
dplyr::filter(obs_greater_than_exp_allIRE == TRUE | obs_greater_than_exp_ire3 == TRUE | obs_greater_than_exp_ire5 == TRUE) %>%
ungroup
# Append information about 3' and 5' IRE genesets
nodes %<>% bind_rows(
tibble(
ids = c(list(ireGenes$ire3_all, ireGenes$ire5_all)),
geneset = c("Predicted 3' IRE genes", "Predicted 5' IRE genes"),
source = c("sires","sires"),
n = c(length(ireGenes$ire3_all), length(ireGenes$ire5_all))
)
)
# Append the Hallmark Heme Metabolism geneset
# Although this gene set wasnt significantly enriched in
# IRE-containing genes, it is probably the most comprehensive
# gene set specifically on heme metabolism, and combines info
# from various studies.
nodes %<>% bind_rows(
h_tib %>% filter(geneset == "HALLMARK_HEME_METABOLISM") %>% mutate(n = length(ids[[1]]))
)
nodes
First we filtered gs for only the genesets that are highly ranked in being enriched for predicted IRE genes. We applied the additional filtering step that the observed number of IRE genes is greater than the expected number. This results in 18 genesets.
The next step is to create all possible combinations of the genesets. We will use the combn function to do this. Then we will create a column common_ids to store the genes which are common to both gene sets being intersected.
gsComb <- combn(nodes$geneset, m = 2) %>% t %>% as.data.frame %>%
set_colnames(c("geneset", "geneset_2b")) %>%
left_join(nodes%>%dplyr::select(ids, geneset), by = "geneset") %>%
dplyr::rename(geneset_nm = geneset,
geneset = geneset_2b) %>%
left_join(nodes%>%dplyr::select(ids, geneset), by = "geneset") %>%
as_tibble %>%
mutate(common_ids = map2(ids.x, ids.y, ~intersect(.x,.y)))
Column `geneset` joining factor and character vector, coercing into character vectorColumn `geneset` joining factor and character vector, coercing into character vector
gsComb
We can extract genes which appear the most often in genesets as follows:
gsComb$common_ids%>%unlist %>% table %>% as.data.frame%>% arrange(desc(Freq)) %>% set_colnames(c("ensembl_gene_id", "f")) %>% left_join(v$genes %>% dplyr::select(-entrezid)) %>% as_tibble
Joining, by = "ensembl_gene_id"
Column `ensembl_gene_id` joining factor and character vector, coercing into character vector
The nodes table will contain
- All genes in the genesets
- The geneset names
pathways <- nodes$geneset %>% as.data.frame %>% set_colnames("label")
genes <- nodes$ids %>% unlist %>% unique %>% as.data.frame %>% set_colnames("label")
nodesDf <- full_join(pathways, genes, by = "label") %>% rowid_to_column("id")
Column `label` joining factors with different levels, coercing to character vector
# nodesDf %>%
# mutate(text = ifelse(id < 18, label, NA)) %>%
# mutate(size = ifelse(id < 16, 2, 1)) %>%
# mutate(colour = ifelse(id < 16, rainbow(26)[id], NA))
head(nodesDf,20)
id label
1 1 TTGCWCAAY_CEBPB_02
2 2 TGANTCA_AP1_C
3 3 YRTCANNRCGC_UNKNOWN
4 4 TGGAAA_NFAT_Q4_01
5 5 KEGG_GLIOMA
6 6 PID_BETA_CATENIN_NUC_PATHWAY
7 7 DACOSTA_UV_RESPONSE_VIA_ERCC3_DN
8 8 WTGAAAT_UNKNOWN
9 9 GO_INTRACELLULAR_SIGNAL_TRANSDUCTION
10 10 CTACTAG_MIR325
11 11 TGGNNNNNNKCCAR_UNKNOWN
12 12 GO_POSTSYNAPTIC_MEMBRANE
13 13 GO_SYNAPTIC_MEMBRANE
14 14 GO_POSITIVE_REGULATION_OF_BLOOD_VESSEL_ENDOTHELIAL_CELL_MIGRATION
15 15 TNCATNTCCYR_UNKNOWN
16 16 Predicted 3' IRE genes
17 17 Predicted 5' IRE genes
18 18 HALLMARK_HEME_METABOLISM
19 19 ENSDARG00000009418
20 20 ENSDARG00000029764
nodesDf2 <- nodesDf %>%
mutate(
ire = case_when(
id < 16 ~ " ",
id == 18 ~ " ",
id == 16 ~ "3",
id == 17 ~ "5",
label %in% ireGenes$ire3_all ~ "3",
label %in% ireGenes$ire5_all ~ "5",
!(label %in% allIREGenes) ~ "no IRE"
)
)%>%
dplyr::select(-id) %>% dplyr::rename(Id = label)
head(nodesDf2, 30)
Id ire
1 TTGCWCAAY_CEBPB_02
2 TGANTCA_AP1_C
3 YRTCANNRCGC_UNKNOWN
4 TGGAAA_NFAT_Q4_01
5 KEGG_GLIOMA
6 PID_BETA_CATENIN_NUC_PATHWAY
7 DACOSTA_UV_RESPONSE_VIA_ERCC3_DN
8 WTGAAAT_UNKNOWN
9 GO_INTRACELLULAR_SIGNAL_TRANSDUCTION
10 CTACTAG_MIR325
11 TGGNNNNNNKCCAR_UNKNOWN
12 GO_POSTSYNAPTIC_MEMBRANE
13 GO_SYNAPTIC_MEMBRANE
14 GO_POSITIVE_REGULATION_OF_BLOOD_VESSEL_ENDOTHELIAL_CELL_MIGRATION
15 TNCATNTCCYR_UNKNOWN
16 Predicted 3' IRE genes 3
17 Predicted 5' IRE genes 5
18 HALLMARK_HEME_METABOLISM
19 ENSDARG00000009418 5
20 ENSDARG00000029764 no IRE
21 ENSDARG00000077842 no IRE
22 ENSDARG00000040184 no IRE
23 ENSDARG00000026723 no IRE
24 ENSDARG00000033498 no IRE
25 ENSDARG00000071168 no IRE
26 ENSDARG00000069356 no IRE
27 ENSDARG00000075899 no IRE
28 ENSDARG00000111712 no IRE
29 ENSDARG00000090624 no IRE
30 ENSDARG00000055543 no IRE
# nodesDf2 %>% write_tsv(here("R","GSEA","data","nodes2.tsv"))
Edges will be between:
- The genesets and all genes within the geneset
- Genes in multiple genesets
edgeDf <- nodes$ids %>% set_names(nodes$geneset) %>% plyr::ldply(data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
left_join(nodesDf, by = c("pathway"="label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesDf, by = c("gene"="label")) %>%
dplyr::rename(to=id) %>%
#dplyr::select(from, to)
dplyr::select(pathway, gene) %>%
set_colnames(c("Source","Target"))
Column `gene`/`label` joining factor and character vector, coercing into character vector
edgeDf %>% head(20)
Source Target
1 TTGCWCAAY_CEBPB_02 ENSDARG00000009418
2 TTGCWCAAY_CEBPB_02 ENSDARG00000029764
3 TTGCWCAAY_CEBPB_02 ENSDARG00000077842
4 TTGCWCAAY_CEBPB_02 ENSDARG00000040184
5 TTGCWCAAY_CEBPB_02 ENSDARG00000026723
6 TTGCWCAAY_CEBPB_02 ENSDARG00000033498
7 TTGCWCAAY_CEBPB_02 ENSDARG00000071168
8 TTGCWCAAY_CEBPB_02 ENSDARG00000069356
9 TTGCWCAAY_CEBPB_02 ENSDARG00000075899
10 TTGCWCAAY_CEBPB_02 ENSDARG00000111712
11 TTGCWCAAY_CEBPB_02 ENSDARG00000090624
12 TTGCWCAAY_CEBPB_02 ENSDARG00000055543
13 TTGCWCAAY_CEBPB_02 ENSDARG00000104279
14 TTGCWCAAY_CEBPB_02 ENSDARG00000005416
15 TTGCWCAAY_CEBPB_02 ENSDARG00000077948
16 TTGCWCAAY_CEBPB_02 ENSDARG00000077749
17 TTGCWCAAY_CEBPB_02 ENSDARG00000051783
18 TTGCWCAAY_CEBPB_02 ENSDARG00000087517
19 TTGCWCAAY_CEBPB_02 ENSDARG00000014181
20 TTGCWCAAY_CEBPB_02 ENSDARG00000004843
Export results
gs %>% saveRDS(here("R","GSEA","results","gs.rds"))
gs %>% ungroup() %>% write.xlsx(here("R","GSEA","results","gs.xlsx"))
# Export significant genesets / of interest
gs %>%ungroup %>%
arrange(fisher_p) %>%
dplyr::filter(fdr < 0.1 | fdr_3 < 0.1 | fdr_5 < 0.1) %>%
dplyr::filter(obs_greater_than_exp_allIRE == TRUE | obs_greater_than_exp_ire3 == TRUE | obs_greater_than_exp_ire5 == TRUE) %>%
dplyr::select(geneset, source, n, n_with_ire3, n_with_ire5, n_without_ire, contains("fisher_p"), contains("fdr")) %>%
write.xlsx(here("R","GSEA","results","gs_topRanked.xlsx"))
9. Human Data
- Managed to get the sets of human genes containing IREs:
zebrafishIreGenes <- readRDS(here::here("R/GSEA/data/ireGenes.rds"))
humanIreGenes <- readRDS(here::here("R/IREGenes/data/human_ireGenes.rds"))
TODO
- [x] Fisher’s test separately on 3’ IRE genes and 5’ IRE genes.
- [x] Stacked bar chart to indicate overlap of the top ~20 genesets with predicted IRE genes.
- [ ] Repeat this but with human / mouse IREs and their genesets to do a cross-species comparison.
- [x] Add in calculation of Expected and Observed IRE gene values and filter significant results to have Observed > Expected.
- [x] Export nodes and edges table for network visualisation.
- [x] Export edges and nodes table again except with Hallmark Heme metabolism added in
Session Info
sessionInfo()
---
title: "Are IRE gene sets over-represented in MSigDB gene sets?"
output:
  html_document:
    df_print: paged
---

```{r Setup, include=FALSE, message=FALSE, warning=FALSE}
# Load packages
library(dplyr)
library(readr)
library(magrittr)
library(limma)
library(here)
library(openxlsx)
library(ggplot2)
library(stringr)
library(purrr)
library(export)
library(reshape2)
library(fgsea)
library(tibble)

# ggplot2 theme
theme_set(theme_bw())

# Load objects required for this analysis. 
# Gene set collections with human entrezgenes converted to zebrafish Ensembl IDs and saved as lists of gene sets. 
h_mapped <- readRDS(here("R","GSEA","data","ens_h_mapped.rds"))  # Hallmark Gene set collection 
c2_mapped <- readRDS(here("R","GSEA","data","ens_c2_mapped.rds"))  # Curated Gene set collection
c3_mapped <- readRDS(here("R","GSEA","data","ens_c3_mapped.rds"))  # Motif Gene set collection
c5_mapped <- readRDS(here("R","GSEA","data","ens_c5_mapped.rds"))  # Gene Ontology Gene set collection 
ireGenes <- readRDS(here("R","GSEA","data","ireGenes.rds")) # List of IRE gene sets, contains 4 gene sets: 3' All IRE genes, 5' All IRE genes, 3' High-Quality IRE genes, 5' High-Quality IRE genes 

# IRE genes GRanges objects exported from SIREs web server
ireUtr3 <- readRDS(here("R","IREGenes","data","ireUtr3.rds"))
ireUtr5 <- readRDS(here("R","IREGenes","data","ireUtr5.rds"))

# DE analysis objects
v <- readRDS(here("R","DE","data","voomData_g.rds"))  # voom object
design <- readRDS(here("R","DE","data","design_g.rds"))  # Design matrix
contrasts <- readRDS(here("R","DE","data","contrasts_g.rds"))  # Contrasts matrix

```


## 0. Introduction

The purpose of this analysis is to determine which gene sets from [MSigDB](http://software.broadinstitute.org/gsea/msigdb/index.jsp) 
are most enriched in the genes we previously predicted to contain IREs. This is important as we wanted to see the biological relevance of the IRE-containing genes, and also their overlap in existing gene sets. 

The approach is as follows:

- Load in relevant gene set collections from MSigDB.
- Get them into a format suitable for performing Fisher's exact test to test whether they are enriched for IRE-containing genes. 
- Results displayed on a bar chart and also as a network. 

The motivation for doing this analysis comes from an interesting observation using the naive approach of searching MSigDB for all gene sets with **iron** or **heme** in their name and seeing the overlap in genes between these gene sets and the IRE-containing genesets we identified. 

```{r}
ironSets <- list(
  c5_GO_2_IRON_2_SULFUR_CLUSTER_BINDING = c5_mapped$GO_2_IRON_2_SULFUR_CLUSTER_BINDING,
  c5_GO_4_IRON_4_SULFUR_CLUSTER_BINDING = c5_mapped$GO_4_IRON_4_SULFUR_CLUSTER_BINDING,
  c5_GO_CELLULAR_IRON_ION_HOMEOSTASIS = c5_mapped$GO_CELLULAR_IRON_ION_HOMEOSTASIS,
  c5_GO_HEME_METABOLIC_PROCESS = c5_mapped$GO_HEME_METABOLIC_PROCESS,
  c5_GO_HEME_BIOSYNTHETIC_PROCESS = c5_mapped$GO_HEME_BIOSYNTHETIC_PROCESS,
  c5_GO_HEMOGLOBIN_COMPLEX = c5_mapped$GO_HEMOGLOBIN_COMPLEX,
  c5_GO_IRON_COORDINATION_ENTITY_TRANSPORT = c5_mapped$GO_IRON_COORDINATION_ENTITY_TRANSPORT,
  c5_GO_IRON_ION_BINDING = c5_mapped$GO_IRON_ION_BINDING,
  c5_GO_IRON_ION_HOMEOSTASIS = c5_mapped$GO_IRON_ION_HOMEOSTASIS,
  c5_GO_IRON_ION_IMPORT = c5_mapped$GO_IRON_ION_IMPORT,
  c5_GO_IRON_ION_TRANSPORT = c5_mapped$GO_IRON_ION_TRANSPORT,
  c5_GO_OXIDOREDUCTASE_ACTIVITY_ACTING_ON_A_HEME_GROUP_OF_DONORS = c5_mapped$GO_OXIDOREDUCTASE_ACTIVITY_ACTING_ON_A_HEME_GROUP_OF_DONORS,
  c5_GO_RESPONSE_TO_IRON_ION = c5_mapped$GO_RESPONSE_TO_IRON_ION,
  h_HEME_METABOLISM = h_mapped$HALLMARK_HEME_METABOLISM,
  c2_REACTOME_IRON_UPTAKE_AND_TRANSPORT = c2_mapped$REACTOME_IRON_UPTAKE_AND_TRANSPORT
)

ironSets_idx <- ids2indices(ironSets, rownames(v))

head(ironSets,2)
head(ironSets_idx,2)
```

We used a gene set enrichment analysis approach to test for whether these iron-related gene sets were enriched in our DE results.

```{r, eval=FALSE}
source(here("R","GSEA","combinedGSEA.R"))

gseaResults_ironSets <- combinedGSEA(v, ironSets_idx, design, contrasts)
# gseaResults_ironSets %>% saveRDS(here("R","GSEA","results","gseaResults_ironSets.rds"))
```

Results for **6 month, normoxia, mutant vs. wild type** below indicate that while some iron-related gene sets show enrichment in this comparison, others do not. 
```{r eval=FALSE}
gseaResults_ironSets$combTest$normoxia_6mth_mutant_vs_wt
```

Overall, the figure below indicates poor overlap between existing gene sets and our IRE-containing gene sets. However, to really see how our IRE genesets relate to existing gene sets, we need to do a more comprehensive test including more of the gene sets not restricted to ones related to iron metabolism. 

```{r fig.width=11, fig.cap="Comparison of gene overlap between our IRE gene sets and existing gene sets related to iron metabolism."}
# Create a dataframe that includes overlap between iron-related gene sets and the ireGenes.
testx <- ironSets %>% lapply(function(x){
  overlap <- list(
  with3ire = sum(x %in% ireGenes$ire3_all),
  with5ire = sum(x %in% ireGenes$ire5_all),
  noIre = sum(!(x %in% ireGenes$ire3_all | x %in% ireGenes$ire5_all))
  )
}) %>% lapply(function(x){
    x %>% bind_rows
}) %>% 
  do.call("rbind",.) %>% 
  mutate(geneset = names(ironSets)) %>%
  bind_rows(data.frame(
    with3ire = length(ireGenes$ire3_all),
    with5ire = length(ireGenes$ire5_all),
    noIre = 0,
    geneset = "- GRCz11 Zebrafish Genes - All Predicted IREs"
  )) %>% 
  bind_rows(data.frame(
    with3ire = ireUtr3 %>% as.data.frame %>% dplyr::filter(quality == "High") %>% use_series("gene_id") %>% unique %>% length,
    with5ire = ireUtr5 %>% as.data.frame %>% dplyr::filter(quality == "High") %>% use_series("gene_id") %>% unique %>% length,
    noIre = 0,
    geneset = "- GRCz11 Zebrafish Genes - High Quality Predicted IREs"
  )) %>%
  mutate(geneset = gsub(x = geneset, pattern = "_", replacement = " "))%>%
  melt 

# Create a stacked bar chart
compPlot <- testx %>% ggplot(aes(x = str_to_title(stringr::str_wrap(geneset, 20)), 
                     y = value, 
                     fill = variable)) + 
  geom_col() +
  theme(aspect.ratio = 0.7, 
        axis.text.x =  element_text(size= 35), 
        axis.title.x = element_text(size= 30),
        axis.title.y = element_text(size = 30), 
        legend.text = element_text(size = 20), 
        legend.title = element_text(size=30),
        axis.text.y = element_text(color = "grey20", size = 16)) + 
  coord_flip() +  # Axis labels too long to be readable so this helps.
  scale_fill_manual(values=c("#FBB829", "#FF0066", "#dddddd"), labels =c("3' IRE", "5' IRE", "No IRE")) +
  labs(y = "Number of genes", x = "Gene set", fill = "Gene\nhas\nIRE?") 

compPlot

```





## 1. Defining Relevant Gene Sets

We will be testing whether our IRE gene sets are enriched in any of the gene sets in the following gene set collections from **MSigDB**:

- **Hallmark gene sets** (**H**) are coherently expressed signatures derived by aggregating many MSigDB gene sets to represent well-defined biological states or processes.
- **Curated gene sets** (**C2**) are from online pathway databases, publications in PubMed, and knowledge of domain experts.
- **Motif gene sets** (**C3**) are based on conserved cis-regulatory motifs from a comparative analysis of the human, mouse, rat, and dog genomes.
- **Gene ontology gene sets** (**C5**) consist of genes annotated by the same GO terms. 

The gene set collections mapped to zebrafish Ensembl IDs have been pre-loaded for this analysis. Each collection is a named list of gene sets, with each gene set containing Ensembl IDs of genes in that set. We will convert all lists into list column structures used in the *tibble* package. 

```{r}
h_tib <- h_mapped %>% tibble %>% set_colnames(c("ids")) %>% mutate(geneset = names(h_mapped), source = "h") 
c2_tib <- c2_mapped %>% tibble %>% set_colnames(c("ids")) %>% mutate(geneset = names(c2_mapped), source = "c2")
c3_tib <- c3_mapped %>% tibble %>% set_colnames(c("ids")) %>% mutate(geneset = names(c3_mapped), source = "c3")
c5_tib <- c5_mapped %>% tibble %>% set_colnames(c("ids")) %>% mutate(geneset = names(c5_mapped), source = "c5")

gs <- bind_rows(h_tib, c2_tib, c3_tib, c5_tib)

gs
```

We previously loaded the `ireGenes` R object, which has the following structure:
```{r}
ireGenes%>%str

allIREGenes <- c(ireGenes$ire3_all, ireGenes$ire5_all)

allIREGenes %>% head
```

## 2. Compute gene set overlap with IRE genesets

We wish to see the overlap between the genesets in `gs` with the 
`ireGenes` (including 3' and 5' IRE genes), 
as well as separately, with the 3' IRE genes and 5' IRE genes. 
We will add this information into the `gs` tibble.

```{r}
gs %<>% rowwise() %>% mutate(
  n = length(ids),  # Number of genes in the geneset
  
  n_with_ire = sum(ids %in% allIREGenes),  # Number of genes in the gene set which have 3' or 5' predicted IREs 
  n_without_ire = (n - n_with_ire),
  universe_with_ire = sum(rownames(v) %in% allIREGenes & !(rownames(v) %in% ids)), 
  universe_without_ire = (length(rownames(v)) - universe_with_ire),
  
  n_with_ire3 = sum(ids %in% ireGenes$ire3_all),  # Number of genes in the gene set which have 3' predicted IREs
  n_without_ire3= (n - n_with_ire),
  universe_with_ire3 = sum(rownames(v) %in% ireGenes$ire3_all & !(rownames(v) %in% ids)), 
  universe_without_ire3 = (length(rownames(v)) - universe_with_ire),
  
  n_with_ire5 = sum(ids %in% ireGenes$ire5_all),  # Number of genes in the gene set which have 5' predicted IREs
  n_without_ire5= (n - n_with_ire),
  universe_with_ire5 = sum(rownames(v) %in% ireGenes$ire5_all & !(rownames(v) %in% ids)), 
  universe_without_ire5 = (length(rownames(v)) - universe_with_ire)
) %>% ungroup()

gs
```

## 3. Contingency tables

Create contingency table for each gene set. We will store this in a 
list of matrices `gs_mat`. 
The 3' and 5' contingency matrices are stored in `gs_mat3` and `gs_mat5`. 

```{r, eval=FALSE}
gs_mat <- gs %>% 
  dplyr::select(n_with_ire,
                n_without_ire, 
                universe_with_ire, 
                universe_without_ire) %>% 
  apply(X = ., MARGIN = 1, FUN = function(x){
    x %>% 
      matrix(2,2) %>% 
      t %>% 
      list()
  }) %>% set_names(gs$geneset) %>% 
  lapply(function(x){
    x %>% .[[1]]
  })

gs_mat3 <- gs %>% 
  dplyr::select(n_with_ire3,
                n_without_ire3, 
                universe_with_ire3, 
                universe_without_ire3) %>% 
  apply(X = ., MARGIN = 1, FUN = function(x){
    x %>% 
      matrix(2,2) %>% 
      t %>% 
      list()
  }) %>% set_names(gs$geneset) %>% 
  lapply(function(x){
    x %>% .[[1]]
  })

gs_mat5 <- gs %>% 
  dplyr::select(n_with_ire5,
                n_without_ire5, 
                universe_with_ire5, 
                universe_without_ire5) %>% 
  apply(X = ., MARGIN = 1, FUN = function(x){
    x %>% 
      matrix(2,2) %>% 
      t %>% 
      list()
  }) %>% set_names(gs$geneset) %>% 
  lapply(function(x){
    x %>% .[[1]]
  })

```
```{r}
gs_mat[1:3]
gs_mat3[1:3]
gs_mat5[1:3]
```


## 4. Fisher's exact test

On each contingency table, we will run Fisher's exact test. 
We then apply FDR correction to adjust the raw *p*-values for multiple testing. 

```{r,eval=FALSE}
fisher_res <- gs_mat %>% lapply(function(x){
  x %>% fisher.test()
})

fisher_res3 <- gs_mat3 %>% lapply(function(x){
  x %>% fisher.test()
})

fisher_res5 <- gs_mat5 %>% lapply(function(x){
  x %>% fisher.test()
})

fisher_res_p <- fisher_res %>% lapply(function(x){x$p.value})
fisher_res_p3 <- fisher_res3 %>% lapply(function(x){x$p.value})
fisher_res_p5 <- fisher_res5 %>% lapply(function(x){x$p.value})

gs %<>% 
  mutate(fisher_p = fisher_res_p%>%unlist%>%unname,
         fisher_p_3 = fisher_res_p3%>%unlist%>%unname,
         fisher_p_5 = fisher_res_p5%>%unlist%>%unname) %>%
  mutate(fdr = p.adjust(fisher_p, "fdr"),
         fdr_3 = p.adjust(fisher_p_3, "fdr"),
         fdr_5 = p.adjust(fisher_p_5, "fdr"))

```


## 5. Calculate Expected and Observed

For each gene set, we will calculate the number of genes **expected** to have 
IREs (based on the background proportion) and **observed** value. 
This information will be added to the `gs` object.

```{r eval=FALSE}
gs %<>% rowwise() %>% mutate(
  exp_allIRE = (universe_with_ire / universe_without_ire)*n_without_ire,
  obs_allIRE = n_with_ire,
  obs_greater_than_exp_allIRE = obs_allIRE > exp_allIRE,
  
  exp_ire3 = (universe_with_ire3 / universe_without_ire3)*n_without_ire3,
  obs_ire3 = n_with_ire3,
  obs_greater_than_exp_ire3 = obs_ire3 > exp_ire3,
  
  exp_ire5 = (universe_with_ire5 / universe_without_ire5)*n_without_ire5,
  obs_ire5 = n_with_ire5,
  obs_greater_than_exp_ire5 = obs_ire5 > exp_ire5
) 
```

```{r}
gs %>%
  dplyr::select(geneset, contains("exp"), starts_with("obs"), ) %>% ungroup()
```


## 6. Results

The following table shows the top 50 gene sets enriched in IRE genesets, 
ranked by Fisher's exact test *p*-values:

```{r}
gs %>% ungroup() %>%
  arrange(fisher_p) %>%
  dplyr::select(geneset, contains("fdr"), n_with_ire, n, starts_with("obs_greater")) %>% 
  head(50)
```

Sorted by genesets most enriched in 3' IRE genes:
```{r}
gs %>% ungroup %>% arrange(fisher_p_3) %>% dplyr::select(geneset, contains("fdr"), n_with_ire3, n, starts_with("obs_greater")) %>% head(50)
```

Sorted by genesets most enriched in 5' IRE genes:
```{r}
gs %>% ungroup %>% arrange(fisher_p_5) %>% dplyr::select(geneset, contains("fdr"), n_with_ire5, n, starts_with("obs_greater")) %>% head(50)
```


## 7. Visualisation (Stacked bar chart)

The following stacked bar chart shows the overlap between the top ~20 
genesets and the predicted-IRE genesets.

```{r fig.width=11,fig.height=8}
overlapDf <- gs %>% arrange(fisher_p) %>% dplyr::filter(fdr < 0.1 | fdr_3 < 0.1 | fdr_5 < 0.1) %>%
  dplyr::filter(obs_greater_than_exp_allIRE == TRUE | obs_greater_than_exp_ire3 == TRUE | obs_greater_than_exp_ire5 == TRUE) %>%
  dplyr::select(geneset, source, n_with_ire3, n_with_ire5, n_without_ire) %>%
  bind_rows(data.frame(
    geneset = c("- GRCz11 Zebrafish Genes - All Predicted IREs"),
    source = c("sires"),
    n_with_ire3 = c(length(ireGenes$ire3_all)),
#                    ireUtr3 %>% as.data.frame %>% dplyr::filter(quality == "High") %>% use_series("gene_id") %>% unique %>% length),
    n_with_ire5 = c(length(ireGenes$ire5_all)),
#                    ireUtr5 %>% as.data.frame %>% dplyr::filter(quality == "High") %>% use_series("gene_id") %>% unique %>% length),
    n_without_ire = c(0)
  )) %>% dplyr::mutate(geneset = paste0(geneset, " ", source)) %>%
  dplyr::mutate(geneset = gsub(x=geneset, pattern = "_", replacement = " ")) %>%
  dplyr::select(-source) %>% melt

overlapPlot <- overlapDf %>% ggplot(aes(x = str_to_title(stringr::str_wrap(geneset, 23)), 
                     y = value, 
                     fill = variable)) + 
  geom_col() +
  theme(aspect.ratio = 0.7, 
        axis.text.x =  element_text(size= 35), 
        axis.title.x = element_text(size= 30),
        axis.title.y = element_text(size = 30), 
        legend.text = element_text(size = 20), 
        legend.title = element_text(size=30),
        axis.text.y = element_text(color = "grey20", size = 13)) + 
  coord_flip() +  # Axis labels too long to be readable so this helps.
  scale_fill_manual(values=c("#FBB829", "#FF0066", "#dddddd"), labels =c("3' IRE", "5' IRE", "No IRE")) +
  labs(y = "Number of genes", x = "Gene set", fill = "Gene\nhas\nIRE?") 

overlapPlot

# export::graph2ppt(overlapPlot, here("R","GSEA","fig","overlapPlot"))
```

## 8. Visualisation (Network)

Although the stacked bar chart shows overlap between different gene sets with the predicted IRE geneset, along with information about the size of each geneset, crucially, it doesn't indicate which genes are in common between the different gene sets. My first idea was pairwise Venn diagrams between the IRE genes in each pair of genesets, but this isn't the most visual / intuitive way to present the information and it doesn't highlight the genes which are most often shared between different genesets. Steve had the idea to represent this as a network, with distinct groups of genes (grey) representing genesets, IRE genes in colour, and edges representing overlap between genesets. 


### 8.1. Create the nodes table

I will need to create a `data.frame` with the following columns:

- **Id**: Gene ID
- **IRE**: Factor which can either be `3' IRE`, `5' IRE`, or `no IRE`. 
- **Geneset**: The gene set which it belongs to. 

```{r}
nodes<- gs %>% arrange(fisher_p) %>% dplyr::filter(fdr < 0.1 | fdr_3 < 0.1 | fdr_5 < 0.1) %>%
  dplyr::filter(obs_greater_than_exp_allIRE == TRUE | obs_greater_than_exp_ire3 == TRUE | obs_greater_than_exp_ire5 == TRUE) %>%
  ungroup

# Append information about 3' and 5' IRE genesets 
nodes %<>% bind_rows(
   tibble(
     ids = c(list(ireGenes$ire3_all, ireGenes$ire5_all)),
     geneset = c("Predicted 3' IRE genes", "Predicted 5' IRE genes"),
     source = c("sires","sires"),
     n = c(length(ireGenes$ire3_all), length(ireGenes$ire5_all))
   )
 ) 

# Append the Hallmark Heme Metabolism geneset
# Although this gene set wasnt significantly enriched in 
# IRE-containing genes, it is probably the most comprehensive
# gene set specifically on heme metabolism, and combines info 
# from various studies. 
nodes %<>% bind_rows(
  h_tib %>% filter(geneset == "HALLMARK_HEME_METABOLISM") %>% mutate(n = length(ids[[1]]))
 ) 

nodes
```

First we filtered `gs` for only the genesets that are highly ranked in being enriched for predicted IRE genes. We applied the additional filtering step that the `observed` number of IRE genes is greater than the `expected` number. This results in `r nrow(nodes)` genesets. 

The next step is to create all possible combinations of the genesets. We will use the `combn` function to do this. Then we will create a column `common_ids` to store the genes which are common to both gene sets being intersected. 

```{r}
gsComb <- combn(nodes$geneset, m = 2) %>% t %>% as.data.frame %>%
  set_colnames(c("geneset", "geneset_2b")) %>%
  left_join(nodes%>%dplyr::select(ids, geneset), by = "geneset") %>%
  dplyr::rename(geneset_nm = geneset,
                geneset = geneset_2b) %>%
  left_join(nodes%>%dplyr::select(ids, geneset), by = "geneset") %>%
  as_tibble %>%
  mutate(common_ids = map2(ids.x, ids.y, ~intersect(.x,.y)))

gsComb 
```

We can extract genes which appear the most often in genesets as follows:

```{r}
gsComb$common_ids%>%unlist %>% table %>% as.data.frame%>% arrange(desc(Freq)) %>% set_colnames(c("ensembl_gene_id", "f")) %>% left_join(v$genes %>% dplyr::select(-entrezid)) %>% as_tibble
```

The nodes table will contain

- All genes in the genesets
- The geneset names

```{r}
pathways <- nodes$geneset %>% as.data.frame %>% set_colnames("label")
genes <- nodes$ids %>% unlist %>% unique %>% as.data.frame %>% set_colnames("label")
nodesDf <- full_join(pathways, genes, by = "label") %>% rowid_to_column("id")

# nodesDf %>%
#   mutate(text = ifelse(id < 18, label, NA)) %>%
#   mutate(size = ifelse(id < 16, 2, 1)) %>%
#   mutate(colour = ifelse(id < 16, rainbow(26)[id], NA)) 

head(nodesDf,20)

nodesDf2 <- nodesDf %>%
  mutate(
    ire = case_when(
      id < 16 ~ " ",
      id == 18 ~ " ",
      id == 16 ~ "3",
      id == 17 ~ "5",
      label %in% ireGenes$ire3_all ~ "3",
      label %in% ireGenes$ire5_all ~ "5",
      !(label %in% allIREGenes) ~ "no IRE"
    )
  )%>%
  dplyr::select(-id) %>% dplyr::rename(Id = label) 

head(nodesDf2, 30) 
# nodesDf2 %>% write_tsv(here("R","GSEA","data","nodes2.tsv"))
```

Edges will be between:

- The genesets and all genes within the geneset 
- Genes in multiple genesets

```{r}
edgeDf <- nodes$ids %>% set_names(nodes$geneset) %>% plyr::ldply(data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  left_join(nodesDf, by = c("pathway"="label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesDf, by = c("gene"="label")) %>%
  dplyr::rename(to=id) %>%
  #dplyr::select(from, to)
  dplyr::select(pathway, gene) %>%
  set_colnames(c("Source","Target"))
edgeDf %>% head(20)


```


## Export results

```{r eval=FALSE}
gs %>% saveRDS(here("R","GSEA","results","gs.rds"))
gs %>% ungroup() %>% write.xlsx(here("R","GSEA","results","gs.xlsx"))

# Export significant genesets / of interest
gs %>%ungroup %>% 
  arrange(fisher_p) %>%
  dplyr::filter(fdr < 0.1 | fdr_3 < 0.1 | fdr_5 < 0.1) %>%
  dplyr::filter(obs_greater_than_exp_allIRE == TRUE | obs_greater_than_exp_ire3 == TRUE | obs_greater_than_exp_ire5 == TRUE) %>%
  dplyr::select(geneset, source, n, n_with_ire3, n_with_ire5, n_without_ire, contains("fisher_p"), contains("fdr")) %>%
  write.xlsx(here("R","GSEA","results","gs_topRanked.xlsx"))
```

## 9. Human Data

- Managed to get the sets of human genes containing IREs:

```{r}
zebrafishIreGenes <- readRDS(here::here("R/GSEA/data/ireGenes.rds"))
humanIreGenes <- readRDS(here::here("R/IREGenes/data/human_ireGenes.rds"))
```


## TODO

- [x] Fisher's test separately on 3' IRE genes and 5' IRE genes. 
- [x] Stacked bar chart to indicate overlap of the top ~20 genesets with predicted IRE genes. 
  - [ ] Repeat this but with human / mouse IREs and their genesets to do a cross-species comparison. 
- [x] Add in calculation of Expected and Observed IRE gene values and filter significant results to have Observed > Expected. 
- [x] Export nodes and edges table for network visualisation. 
- [x] Export edges and nodes table again except with Hallmark Heme metabolism added in

## Session Info

```{r}
sessionInfo()
```

